{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "Td_alkbOv3Aj",
      "metadata": {
        "id": "Td_alkbOv3Aj"
      },
      "source": [
        "# Spark RAPIDS get_json_object acceleration\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "c6ed860b",
      "metadata": {
        "id": "c6ed860b"
      },
      "source": [
        "<a target=\"_blank\" href=\"https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/10min_to_cudf_colab.ipynb\">\n",
        "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
        "</a>\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "AhUsdz6jLdMi",
      "metadata": {
        "id": "AhUsdz6jLdMi"
      },
      "source": [
        "\n",
        "Before getting started - be sure to change your runtime to use a GPU Hardware accelerator! Use the Runtime -> \"Change runtime type\" menu option to add a GPU."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ZfNDlz0SM0DB",
      "metadata": {
        "id": "ZfNDlz0SM0DB"
      },
      "source": [
        "# Let's get started using the RAPIDS Accelerator for Apache Spark"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "PzW61-K04A1E",
      "metadata": {
        "id": "PzW61-K04A1E"
      },
      "outputs": [],
      "source": [
        "!nvidia-smi"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!cat /proc/cpuinfo"
      ],
      "metadata": {
        "id": "OIEun51OCyC4"
      },
      "id": "OIEun51OCyC4",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "spark_version='3.5.0'\n",
        "rapids_version='24.12.0'"
      ],
      "metadata": {
        "id": "NEGt46X7nEqf"
      },
      "id": "NEGt46X7nEqf",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%pip install --quiet \\\n",
        "  pyspark=={spark_version}"
      ],
      "metadata": {
        "id": "g9XK28gcnHiG"
      },
      "id": "g9XK28gcnHiG",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from importlib.resources import files\n",
        "from pyspark.sql import SparkSession\n",
        "import glob\n",
        "import os\n",
        "import re\n",
        "import time\n",
        "import statistics"
      ],
      "metadata": {
        "id": "gr2msGD1nLh-"
      },
      "id": "gr2msGD1nLh-",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Find the spark-sql jar shipped inside the installed pyspark package. The regex below\n",
        "# captures the <major>.<minor> that follows 'spark-sql_' in the jar's file name; that\n",
        "# value is used as the Scala version suffix of the RAPIDS artifact.\n",
        "pyspark_files = files('pyspark')\n",
        "# Unpacking raises ValueError if no matching jar exists, which surfaces a broken install early.\n",
        "spark_sql_jar_path, *_ = glob.glob(f\"{pyspark_files}/*/spark-sql_*jar\")\n",
        "spark_sql_jar = os.path.basename(spark_sql_jar_path)\n",
        "# The dot between major and minor is escaped so that it only matches a literal '.'.\n",
        "scala_version = re.search(r'^spark-sql_(\\d+\\.\\d+)-.*\\.jar$', spark_sql_jar).group(1)"
      ],
      "metadata": {
        "id": "0uXK6z8KoFUt"
      },
      "id": "0uXK6z8KoFUt",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Start (or reuse) a Spark session that loads the RAPIDS Accelerator SQL plugin.\n",
        "rapids_package = f\"com.nvidia:rapids-4-spark_{scala_version}:{rapids_version}\"\n",
        "\n",
        "session_builder = SparkSession.builder.appName('JSON PySpark RAPIDS=ON/OFF')\n",
        "session_builder = session_builder.config('spark.driver.memory', '5g')\n",
        "session_builder = session_builder.config('spark.plugins', 'com.nvidia.spark.SQLPlugin')\n",
        "# The plugin jar is requested by Maven coordinate, built from the versions detected above.\n",
        "session_builder = session_builder.config('spark.jars.packages', rapids_package)\n",
        "\n",
        "spark = session_builder.getOrCreate()\n",
        "spark"
      ],
      "metadata": {
        "id": "ayT5VJQvnQv4"
      },
      "id": "ayT5VJQvnQv4",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "location = \"./TMP_DATA\"\n",
        "iters = 3"
      ],
      "metadata": {
        "id": "3VsYyTATpNG1"
      },
      "id": "3VsYyTATpNG1",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def mk_json_column(i):\n",
        "    \"\"\"Return the concat() argument list for one JSON member of the form \"<key>\":<i>.\n",
        "\n",
        "    The key is CAST(rand(i) * 10000 AS LONG), i.e. a pseudo-random integer seeded\n",
        "    with `i`; the value is the literal `i`.\n",
        "    \"\"\"\n",
        "    return f\"\"\" '\"', CAST(rand({i}) * 10000 AS LONG), '\":{i}'\"\"\"\n",
        "\n",
        "# generate json lines with very sparse keys: every row gets 100 members whose keys are\n",
        "# random, so any particular key is present in only a fraction of the rows\n",
        "json_members = \", ',' ,\".join(mk_json_column(i) for i in range(100))\n",
        "\n",
        "# The closing brace is passed as its own concat() argument instead of relying on the\n",
        "# SQL parser merging the adjacent literals '\":99' and '}' into a single string.\n",
        "(\n",
        "    spark.range(1000000)\n",
        "    .selectExpr(f\"concat('{{', {json_members}, '}}') as json\")\n",
        "    .write.mode(\"overwrite\")\n",
        "    .parquet(location)\n",
        ")"
      ],
      "metadata": {
        "id": "diUi3mxWh91X"
      },
      "id": "diUi3mxWh91X",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Test pulling out a few keys using the GPU\n",
        "spark.conf.set(\"spark.rapids.sql.enabled\",True)\n",
        "\n",
        "# For each probed key, count the rows where get_json_object finds it; also report the\n",
        "# average size of the JSON strings.\n",
        "gpu_exprs = [\n",
        "    \"count(get_json_object(json,'$.0')) as zero\",\n",
        "    \"count(get_json_object(json,'$.10')) as ten\",\n",
        "    \"count(get_json_object(json,'$.100')) as hundred\",\n",
        "    \"count(get_json_object(json,'$.1000')) as thousand\",\n",
        "    \"count(get_json_object(json,'$.1001')) as thousandAndOne\",\n",
        "    \"avg(octet_length(json)) as len\",\n",
        "]\n",
        "\n",
        "gpu_times = []\n",
        "for i in range(iters):\n",
        "    # perf_counter() is a monotonic, high-resolution clock, so the measured interval\n",
        "    # cannot be skewed by wall-clock adjustments the way time.time() can.\n",
        "    start = time.perf_counter()\n",
        "    df = spark.read.parquet(location).selectExpr(*gpu_exprs)\n",
        "    if i == 0:\n",
        "        # Print the result table once; the remaining runs only materialize it.\n",
        "        df.show()\n",
        "    else:\n",
        "        df.collect()\n",
        "    end = time.perf_counter()\n",
        "    gpu_times.append(end - start)\n",
        "\n",
        "\n",
        "print(f\"Median execution time of {iters} runs for GPU get_json_object: {statistics.median(gpu_times):.3f}\")"
      ],
      "metadata": {
        "id": "iXaXVgBNt4pK"
      },
      "id": "iXaXVgBNt4pK",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#  Run the same test using the CPU. Note that this is an exceptional result\n",
        "#  because Colab provides very little CPU (2 cores) to go with the GPU (T4).\n",
        "#  On a 16-core AMD CPU that is not overcommitted, with an NVMe drive to load\n",
        "#  the data and an A6000 GPU, the GPU takes about 0.662 seconds to complete and\n",
        "#  the CPU takes about 2.986 seconds, or about a 4.5x speedup, compared to this\n",
        "#  notebook's ~30x speedup.\n",
        "spark.conf.set(\"spark.rapids.sql.enabled\",False)\n",
        "\n",
        "# For each probed key, count the rows where get_json_object finds it; also report the\n",
        "# average size of the JSON strings.\n",
        "cpu_exprs = [\n",
        "    \"count(get_json_object(json,'$.0')) as zero\",\n",
        "    \"count(get_json_object(json,'$.10')) as ten\",\n",
        "    \"count(get_json_object(json,'$.100')) as hundred\",\n",
        "    \"count(get_json_object(json,'$.1000')) as thousand\",\n",
        "    \"count(get_json_object(json,'$.1001')) as thousandAndOne\",\n",
        "    \"avg(octet_length(json)) as len\",\n",
        "]\n",
        "\n",
        "cpu_times = []\n",
        "for i in range(iters):\n",
        "    # perf_counter() is a monotonic, high-resolution clock, so the measured interval\n",
        "    # cannot be skewed by wall-clock adjustments the way time.time() can.\n",
        "    start = time.perf_counter()\n",
        "    df = spark.read.parquet(location).selectExpr(*cpu_exprs)\n",
        "    if i == 0:\n",
        "        # Print the result table once; the remaining runs only materialize it.\n",
        "        df.show()\n",
        "    else:\n",
        "        df.collect()\n",
        "    end = time.perf_counter()\n",
        "    cpu_times.append(end - start)\n",
        "\n",
        "print(f\"Median execution time of {iters} runs for CPU get_json_object: {statistics.median(cpu_times):.3f}\")"
      ],
      "metadata": {
        "id": "lUmVe12Wic5X"
      },
      "id": "lUmVe12Wic5X",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3.9.12 ('base')",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.12"
    },
    "vscode": {
      "interpreter": {
        "hash": "5327a248d9883bedf47bfd9e608af95bf318797e621edcc550c6b5b3fdc820cc"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}